




*******************************************************************************
* Initial Set up
*******************************************************************************

clear all
set maxvar 11000



*******************************************************************************
* Program for outputting single numbers
*******************************************************************************
	
* For writing something (the `anything') that is not enclosed by double quotes:
cap program drop write_single_num
program define write_single_num
	syntax anything, Filename(string)
	cap erase "`filename'"
	cap file close temp_file
	file open temp_file using "`filename'", write
	file write temp_file "`anything'"
	file close temp_file

end

* For writing something (the `anything') that IS enclosed by double quotes:
cap program drop write_single_string
program define write_single_string
	syntax anything, Filename(string)
	cap erase "`filename'"
	cap file close temp_file
	file open temp_file using "`filename'", write
	file write temp_file `anything'
	file close temp_file

end

*************
* Read in and initial cleaning
*************

	import excel using "Data/Raw/Wang_et_al_data/Wang_expanded_sample.xlsx", sheet("Component_Emissions") first clear
	
	sum Emissionkgday // does not include zeros
	describe Operator Site TreatmentGroup 
	destring Survey, ignore("Survey ") replace // makes numeric
	rename TankRelation TankRelated 
	rename EmittingComponent Component
	describe, full

	encode Operator, gen(Operator_R)
	drop Operator 
	rename Operator_R Operator_ 
	

***************************************************************************
* Exploring data (no new variables generated in here)
***************************************************************************	

	
	sum Emissionkgd if Survey==1 & TreatmentGroup == "Control", detail
	sum Emissionkgd if Survey==5 & TreatmentGroup == "Control", detail
	sum Emissionkgd if Survey==1 & TreatmentGroup != "Control", detail
	sum Emissionkgd if Survey==5 & TreatmentGroup != "Control", detail

	* Site level tags:
	egen tag_Site = tag(Site)
	
	* Exploring Vent/Leak status
	tab VentLeak, m
	
	tab TreatmentGroup if missing(VentLeak), m // spread across all 
	tab IncludedinAnalysis if missing(VentLeak), m // both yes and no
	sum Emissionkgd if missing(VentLeak) // all are zero emissions
	tab Tag if missing(VentLeak), m // all are missing or NA
	tab SiteType if missing(VentLeak) // mix of gas and oil, MW and SW

	* Cases with missing(VentLeak) are those where there are no site emissions at all
	* replaces VentLeak with "missing" if missing
	egen has_missing_VentLeak = mean(missing(VentLeak)), by(Site Survey)
	tab has_missing_VentLeak  
	drop has_missing_VentLeak 
	
	
	* Makes sure that there is no variation in IncludedinAnalysis within Site 
	egen mean_in_analysis = mean(IncludedinAnalysis=="Yes"), by(Site)
	sum mean_in_analysis if tag_Site, detail
	tab mean_in_analysis, m  
	drop mean_in_analysis
	
	* Makes sure there is no variation in Operator within Site x Survey
	egen sd_Operator = sd(Operator), by(Site Survey)
	tab sd_Operator, m // no-within Site x Survey variation
	drop sd_Operator 
	
	* Drops variables that are not necessary 
	drop tag_Site 
	
	* Whether has an endline survey at all 
	egen has_baseline_survey = max(Survey==1), by(Site)
	egen has_survey_2 = max(Survey==2), by(Site)
	egen has_survey_3 = max(Survey==3), by(Site)
	egen has_survey_4 = max(Survey==4), by(Site)
	egen has_endline_survey = max(Survey==5), by(Site)

	
	* drops these extra variables:
	drop has_baseline_survey has_survey_? has_endline_survey
	
	* keeps only Treatment sites (because control has no tracking)
	drop if TreatmentGroup == "Control"
	* Saves 
	tempfile temp
	save "`temp'"

	
***************************************************************************
* Site-level data read in:
***************************************************************************	

	import excel using "Data/Raw/Wang_et_al_data/Wang_expanded_sample.xlsx", ///
		sheet("Site_Emissions") first clear
	
	* Destrings variables that should be numeric
	foreach var of varlist ProductionCH4103m3month ProductionGas103m3month ///
	          ProductionOilm3month ProductionGasEnergyGJmonth ///
			  ProductionOilEnergyGJmonth {
		replace `var' = "0" if `var'=="NA"
		destring `var', replace ignore(",")
	}

	* Destrings Survey
	destring Survey, ignore("Survey ") replace
	
	* Keeps only baseline 
	keep if inlist(Survey,1) // pre and post only

	* Keeps only non-control sites:
	keep if Treatment != "Control"
	
	* Keeps only variables that we care about 
	keep Site ProductionGas103m3month ProductionOilm3month Site_start_year
	
	duplicates report Site 

	* Saves:
	tempfile site_level_data 
	save "`site_level_data'", replace

**********
* Public tagged data:
*********


	import excel using "Data/Raw/Wang_et_al_data/Wang_2024_original_data.xlsx", ///
		sheet("Tagged_Emissions") first clear

	* What constitutes repair?
	sum Followup if Repaired=="Yes", detail // almost all are zero, but 6 out of 155 are positive
	count if Repaired=="Yes" & Followup>0 & Followup<. 
	list Tag Initial Followup if Repaired=="Yes" & Followup>0
	sum Followup if Repaired=="No", detail   // all are positive
	
	gen change_as_frac = Followup/Initial 
	sum change_as_frac if Repaired=="Yes", detail // 6 positives, 2 where number is greater than 1 (increase)
	sum change_as_frac if Repaired=="No", detail // Some big increases but also some very big decreases
	drop change_as_frac 
	
	* Drops cases where initial is zero
	sum Followup if Initial==0
	drop if Initial==0
	
	* Finds the first observation within the Tag ID variable	
	gen n = _n 
	sort Tag n 
	egen tag_Tag = tag(Tag)
	egen count_Tag = total(1), by(Tag)
	tab count_Tag TreatmentGroup, m
	by Tag: gen n_within_Tag = _n
	count if tag_Tag // 355 -- contrast with 438
	
	
	* Reshapes long to get the order of observations
	keep Tag count_Tag n_within_Tag Repaired TreatmentGroup Initial Followup
	rename Initial reading_1
	rename Followup reading_2
	reshape long reading_, i(Tag n_within_Tag) j(order)
	
	egen group_within = group(Tag reading_)
	egen total_group_within = total(1), by(group_within)
	gen likely_first = order == 1 & total_group_within == 1
	gen likely_last = order == 2 & total_group_within == 1
	
	egen mean_n_within_Tag_if_first_temp = mean(n_within_Tag) if likely_first == 1, by(Tag) 
	egen mean_n_within_Tag_if_first = mean(mean_n_within_Tag_if_first_temp), by(Tag)
	gen is_first_row = n_within_Tag == mean_n_within_Tag_if_first
	drop mean_n_within_Tag_if_first_temp
	
	egen mean_n_within_Tag_if_last_temp = mean(n_within_Tag) if likely_last == 1, by(Tag) 
	egen mean_n_within_Tag_if_last = mean(mean_n_within_Tag_if_last_temp), by(Tag)
	gen is_last_row = n_within_Tag == mean_n_within_Tag_if_last
	drop mean_n_within_Tag_if_last_temp

	* One case where we have a zero reading and then a non-zero reading and then a zero-reading -- 
	* we'll correct by hand:
	replace is_last_row = 1 if Tag==1916 & n_within_Tag==2
	
	* One case where there doesn't seem to be matching:
	list Tag n_within_Tag order reading_ if Tag==2041 // readings don't match consistently
	
	* then sorts:
	gsort Tag -is_first_row +is_last_row n_within_Tag order 
	
	* Double checks that readings for the old follow-up match the reading of the new initial
	count if Tag==Tag[_n+1] & order==2 & order[_n+1]==1   // 68
	tab TreatmentGroup if Tag==Tag[_n+1] & order==2 & order[_n+1]==1 // biannual and triannual cases
	count if Tag==Tag[_n+1] & order==2 & order[_n+1]==1 & reading_ == reading_[_n+1] 
	count if Tag==Tag[_n+1] & order==2 & order[_n+1]==1 & reading_ != reading_[_n+1] 
	egen has_mismatch = max(Tag==Tag[_n+1] & order==2 & order[_n+1]==1 & reading_ != reading_[_n+1]), by(Tag)

	* Then converts to a file that has one observation per date (instead of the matched Initial Follow-up pieces)
	* Drops the order = 
	gsort Tag -is_first_row +is_last_row n_within_Tag order 
	drop if Tag==Tag[_n-1] & order == 1 & Tag != 2041
	gen n = _n
	bysort Tag (n): gen final_order = _n

	replace Repaired = "" if final_order==1
	
	* Hand codes Tag 2041
	replace order = 1 if Tag==2041 & reading==1.93
	replace order = 2 if Tag==2041 & reading==1.08
	replace order = 3 if Tag==2041 & reading==3.25
	replace order = 4 if Tag==2041 & reading==0
	replace Repaired = "No" if Tag==2041 & reading==3.25
	
	sort Tag final_order
	
	* Looks at duplicates -- there are some because of zeros:
	duplicates report Tag reading_
	* but no non-zero duplicates
	duplicates report Tag reading_ if reading_ > 0 // none
	
	* keeps only relevant variables:
	keep Tag final_order reading Repaired TreatmentGroup
	rename reading Emissionkgday
	rename TreatmentGroup TreatmentGroup_tag
	
	* reshapes wide 
	reshape wide Emissionkgday Repaired, i(Tag) j(final_order)
	drop Repaired1 // always missing (by construction)
	rename Emissionkgday1 Emissionkgday_init  
	rename Emissionkgday2 Emissionkgday_followup1 
	rename Emissionkgday3 Emissionkgday_followup2 
	rename Emissionkgday4 Emissionkgday_followup3
	rename Repaired2 Repaired_followup1
	rename Repaired3 Repaired_followup2
	rename Repaired4 Repaired_followup3
	
	* Saves:
	tempfile public_tags 
	save "`public_tags'"
	
	
	
	
	
***************************************************************************
* Constructs component-level data
***************************************************************************	

	use "`temp'"

	* Site-level venting and leakage totals
	egen Tot_emiss_Vent = total(Emissionkgday * (VentLeak == "Vent")), by(Site Survey)
	egen Tot_emiss_Leak = total(Emissionkgday * (VentLeak == "Leak")), by(Site Survey)
	* Site-level number of leaks:
	egen Tot_count_Leak = total(VentLeak == "Leak"), by(Site Survey)
	egen Tot_count_Vent = total(VentLeak == "Vent"), by(Site Survey)
	* Only keeps baseline and endline  -- and will eventually drop endline 
	keep if inlist(Survey, 1, 5)
	keep Survey Site SiteType TreatmentGroup Operator_ Tot_emiss_Vent Tot_emiss_Leak Tot_count_Leak Tot_count_Vent
	duplicates drop 
	duplicates report Site Survey 
	* Saves
	tempfile site_level
	save "`site_level'"
	* Save only survey 1
	keep if Survey==1
	tempfile site_level_baseline 
	save "`site_level_baseline'"
	
	* Meges in the public site data
	merge 1:1 Site using "`site_level_data'"
	drop _merge
	save "`site_level_baseline'", replace 
	
	
	* Now component-level, leaks only
	use "`temp'", clear
	keep if VentLeak == "Leak"

	keep if inlist(Survey, 1, 5)
	
	sum Emissionkgday // the minimum is positive
	destring Tag, replace ignore("NA")
	
	* Gets approximate repair status
	*     If is tagged but does not have endline, then assume repaired 
	*     If is tagged and has non-zero at endline, then assume not repaired 
	*     If is not tagged, then it is unknown whether repaired
	gen has_tag = ~missing(Tag) if Survey==1
	egen has_endline = max(Survey==5) if ~missing(Tag), by(Site Tag)
	
	* Save only survey 1
	keep if Survey==1
	* keeps only component-level variables and Site 
	keep Site Tag Component TankRelated Emissionkgday 
	* Save 
	tempfile leak_component_baseline
	save "`leak_component_baseline'"
	
	* Now merges together at the Site level 
	use "`leak_component_baseline'", clear
	merge m:1 Site using "`site_level_baseline'"
	drop if _merge==2  // There are a number of sites with zero baseline leakage
	drop _merge

	* In preparation for merge, creates a fake tag for the ones that are missing tags -- will help to make merging easier
	replace Tag = -_n if missing(Tag)
	duplicates report Tag  // still a few duplicates. Here, unclear which ones are correct. However, merging with public_tags will help
	duplicates tag Tag, gen(duptag)

	* are there weird variables not matching?
	describe, varlist
	local orig_varlist `r(varlist)'
	describe using "`public_tags'", varlist 
	local tag_varlist `r(varlist)'
	local vars_in_both : list orig_varlist & tag_varlist 
	di "`in_both'"

	
	
	* Merges in info for whether is in the tagged_emissions sheet
	merge m:1 Tag using "`public_tags'"
	
	// Someof these will also be gas gathering systems; some will be tags that only appear later on
	drop if _merge==2
	gen in_public_tag_data = _merge==3
	drop _merge 

	
	
	
	
	* Checks treatment groups:
	tab TreatmentGroup TreatmentGroup_tag // almost complete match 
	list Site Tag if TreatmentGroup != TreatmentGroup_tag & in_public_tag_data // site 105, tag 1988
	
	tab in_public_tag_data if Tag > 0 
	tab in_public_tag_data if duptag & !missing(duptag) // only two cases with in_public_tag_data == 1 matches 
	list Site Tag Emissionkgday Emissionkgday_init if in_public_tag_data & duptag>0 & ~missing(duptag)
	drop if in_public_tag_data & duptag>0 & ~missing(duptag) & abs( Emissionkgday - Emissionkgday_init) > 0.01 // site 105, tag 1988 is the mismatch 
	
	corr Emissionkgday Emissionkgday_init if in_public_tag_data // 100% correlation 
	

	*********************************
	
	* Looks at fraction that can be matched:
	tab in_public_tag_data,   m

	qui count if in_public_tag_data == 1
	local count_tagged = r(N)
	write_single_num `count_tagged', file("Paper/Figures/single_numbers/repair_count_leaks_tracked.tex")
	
	qui count 
	local count_all = r(N)
	write_single_num `count_all', file("Paper/Figures/single_numbers/repair_count_all_leaks.tex")

	count if Repaired_followup1 == "Yes"
	local count_repaired = r(N)
	write_single_num `count_repaired', file("Paper/Figures/single_numbers/repair_count_repaired_leaks.tex")

	count if Repaired_followup1 == "No"
	local count_not_repaired = r(N)
	write_single_num `count_not_repaired', file("Paper/Figures/single_numbers/repair_count_not_repaired_leaks.tex")
	
	local percent_repaired = round(100*`count_repaired'/`count_tagged')
	write_single_num `percent_repaired', file("Paper/Figures/single_numbers/repair_percent_repaired.tex")
	

********************
* Analysis
********************	



	* Fraction repaired:
	tab Repaired_followup1, m
	egen frac_repaired_1 = mean(Repaired_followup1 == "Yes"), by(Site)
	egen frac_notrepaired_1 = mean(Repaired_followup1 == "No"), by(Site)
	egen frac_unknown_1 = mean(missing(Repaired_followup1)), by(Site)
	egen tag_Site = tag(Site)
	gen frac_repaired_1_adj    = frac_repaired_1 / (1 - frac_unknown_1)
	gen frac_notrepaired_1_adj = frac_notrepaired_1 / (1 - frac_unknown_1)
	egen count_not_missing = total(~missing(Repaired_followup1)), by(Site)
	sum frac_repaired_1_adj if tag_Site, detail
	tab frac_repaired_1_adj if tag_Site & count_not_missing >= 2
	
	* Outcome variables --- best approximations of whether repaired:
	* whether repaired by next survey:
	gen     repair_v1 = Repaired_followup1=="Yes" if !missing(Repaired_followup1)
	* whether repaired by last survey observed
	gen     repair_v2 = Repaired_followup1=="Yes" if !missing(Repaired_followup1)
	replace repair_v2 = Repaired_followup2=="Yes" if !missing(Repaired_followup2)
	replace repair_v2 = Repaired_followup3=="Yes" if !missing(Repaired_followup3)
	* whether ever repaired:
	gen     repair_v3 = Repaired_followup1=="Yes" if !missing(Repaired_followup1)
	replace repair_v3 = 1                         if Repaired_followup2=="Yes"
	replace repair_v3 = 1                         if Repaired_followup3=="Yes"
	* whether zero emission by next follow-up reading
	gen     emiss0_v1 = Emissionkgday_followup1 == 0 if !missing(Emissionkgday_followup1)
	* whether zero emissions observed by last date
	gen     emiss0_v2 = Emissionkgday_followup1 == 0 if !missing(Emissionkgday_followup1)
	replace emiss0_v2 = Emissionkgday_followup2 == 0 if !missing(Emissionkgday_followup2)
	replace emiss0_v2 = Emissionkgday_followup3 == 0 if !missing(Emissionkgday_followup3)
	* whether zero emissions ever observed
	gen     emiss0_v3 = Emissionkgday_followup1 == 0 if !missing(Emissionkgday_followup1)
	replace emiss0_v3 = 1                            if Emissionkgday_followup2==0
	replace emiss0_v3 = 1                            if Emissionkgday_followup3==0
	
	* how well correlated:
	corr repair_v? emiss0_v? 
	
	tab repair_v1 
	count if ~missing(repair_v1)
	codebook Operator if ~missing(repair_v1)
	codebook Site if ~missing(repair_v1)
	
	
	*********************
	* Control variables:

	gen Other_emiss_Leak = Tot_emiss_Leak - Emissionkgday
	gen Other_emiss_Total = max(Other_emiss_Leak + Tot_emiss_Vent, 0)
	sum Tot_emiss_Vent Other_emiss_Leak , detail
	gen Other_count_Leak = Tot_count_Leak - 1
	
	encode Component, gen(Component_R)
	encode TankRelated, gen(TankRelated_R)
	encode TreatmentGroup, gen(TreatmentGroup_R)
	encode SiteType, gen(SiteType_R)
	
	gen av_Other_emiss_Leak = Other_emiss_Leak / Other_count_Leak 
	gen av_emiss_Vent = Tot_emiss_Vent / Tot_count_Vent

	gen log_Emission = log(Emissionkgday)
	gen log_Tot_emiss_Vent = log(Tot_emiss_Vent)
	gen log_Tot_emiss_Leak = log(Tot_emiss_Leak)
	gen log_Other_emiss_Leak = log(Other_emiss_Leak)
	
	gen log1_Tot_emiss_Vent = log(1 + Tot_emiss_Vent)
	gen log1_Tot_emiss_Leak = log(1 + Tot_emiss_Leak)
	gen log1_Other_emiss_Leak = log(1 + Other_emiss_Leak)

	
	gen log_count_Leak = log(Tot_count_Leak)
	gen log_Tot_count_Vent = log(Tot_count_Vent)

	gen log1_Oil = log(ProductionOilm3month + 1 )
	gen log1_Gas = log(ProductionGas103m3month)
	

	gen Biannual = TreatmentGroup == "Bi-Annual"
	gen Triannual = TreatmentGroup == "Tri-Annual"
	
	
	gen n = _n 
	qui sum Other_emiss_Leak
	local n_count = r(N)
	sort Other_emiss_Leak 
	gen quan_Other_emiss_Leak = _n/`n_count' if ~missing(Other_emiss_Leak)
	sort Tot_emiss_Vent 
	gen quan_Tot_emiss_Vent = _n/`n_count' if ~missing(Tot_emiss_Vent)
	sort Emissionkgday 
	gen quan_Emissionkgday = _n/`n_count' if ~missing(Emissionkgday)
	
	
	gen older_than_20 = Site_start_year <= 1998
	
	***************** 
	* Labels:
	label var Emissionkgday "Component leakage" 
	label var Other_emiss_Leak "Total other leakage"
	label var Tot_emiss_Vent "Total venting"
	
	label var log_Emission "Baseline log(component leakage)"
	label var log_Other_emiss_Leak "Baseline log(total other leakage)"
	label var log_Tot_emiss_Vent "Baseline log(total venting)"
	label var log_count_Leak "Baseline log(count leaks)"
	
	label var log1_Other_emiss_Leak "Baseline log(1 + total other leakage)"
	label var log1_Tot_emiss_Vent "Baseline log(1 + total venting)"
	
	label var Site_start_year "Site establishment year"
	label var older_than_20 "1(20+ years old)"
	

	*****************
	* What if we account for the total size of emissions at the site? 
	reg repair_v1 log_Emission log1_Other_emiss_Leak log1_Tot_emiss_Vent if in_public_tag_data == 1, cluster(Operator)
	estimates sto reg1 

	reg repair_v1 log_Emission log1_Other_emiss_Leak log1_Tot_emiss_Vent older_than_20 if in_public_tag_data == 1, cluster(Operator)
	estimates sto reg2 
	
	reg repair_v1 log_Emission log1_Other_emiss_Leak log1_Tot_emiss_Vent older_than_20 i.Component_R Biannual Triannual  if in_public_tag_data == 1, cluster(Operator)
	estadd local has_controls "Y"
	estimates sto reg3 
	
	
	tab older_than_20 if e(sample)
	
	
	#delimit ;
	esttab reg1 reg2 reg3 
		using "Paper/Figures/component_repair_regressions.tex",
		order(log_Emission log1_Other_emiss_Leak log1_Tot_emiss_Vent older_than_20  Biannual Triannual )
		keep(log_Emission log1_Other_emiss_Leak log1_Tot_emiss_Vent older_than_20 Biannual Triannual  _cons)
		b(%8.3f) se(%8.3f) nogaps mtitles("1(repaired)" "1(repaired)" "1(repaired)")
		replace obslast scalars("has_controls Controls" "r2 \hline R Squared" ) numbers
		se compress label nonotes nostar /* star(* .1 ** .05 *** .01) */;
	#delimit cr

